Combine the two files (UNSW_NB15_training-set_csc215.csv and UNSW_NB15_test-set_csc215.csv) into a new CSV file, combined_csv.csv, that contains the rows of both.
**Note:
import os
import glob
import pandas as pd
# Merge every CSV in the data folder into a single combined_csv.csv.
os.chdir(r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Files")
extension = 'csv'
output_name = "combined_csv.csv"
# The output is written into the same folder it is globbed from, so a second
# run would otherwise read the previous combined file back in and duplicate
# every row. Sorting makes the row order independent of directory listing order.
all_filenames = sorted(
    f for f in glob.glob('*.{}'.format(extension))
    if f.lower() != output_name
)
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
# utf-8-sig writes a byte-order mark at the start of the file.
combined_csv.to_csv(output_name, index=False, encoding='utf-8-sig')
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# One-hot encode a text column (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue).
def encode_text_dummy(df, name):
    """Replace column ``name`` of ``df`` with one indicator column per value.

    The new columns are named ``"<name>-<value>"`` and appended to ``df``;
    the source column is then dropped. ``df`` is modified in place and
    nothing is returned.
    """
    indicator_frame = pd.get_dummies(df[name])
    for category, indicator in indicator_frame.items():
        df["{}-{}".format(name, category)] = indicator
    df.drop(name, axis=1, inplace=True)
# Label-encode a text column (i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace column ``name`` of ``df`` with integer class indexes, in place.

    Returns the encoder's ``classes_`` array, i.e. the original values in the
    order of the indexes assigned to them.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(df[name])
    df[name] = encoder.transform(df[name])
    return encoder.classes_
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise column ``name`` of ``df`` in place: ``(value - mean) / sd``.

    Args:
        df: DataFrame to modify.
        name: Column to standardise.
        mean: Mean to subtract; computed from the column when ``None``.
        sd: Standard deviation to divide by; computed from the column
            (pandas sample std) when ``None``.

    A column with zero or undefined spread (constant column, or a single
    row) is set to 0.0 instead of being divided by zero, which would fill it
    with NaN/inf and propagate into every model trained on the frame.
    """
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    centered = df[name] - mean
    if sd == 0 or pd.isna(sd):
        # No measurable spread: every value sits at the mean.
        df[name] = centered * 0.0
    else:
        df[name] = centered / sd
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
path =r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Files"
#Fetch the dataset from combined csv file
filename = os.path.join(path,"combined_csv.csv")
#counting no.of rows of combined csv file (the header line is included in the count)
# Use a context manager so the handle is closed, and count lazily instead of
# holding every line of the file in memory.
with open(filename) as csv_file:
    numline = sum(1 for _ in csv_file)
print (numline)
# 'NA', '?' and '-' are all read as missing values.
df = pd.read_csv(filename,na_values=['NA','?','-'])
df
# return rows with one or more nulls
df[df.isnull().any(axis=1)]
#Step-1: Remove Null values
df = df.dropna()
df
#Step-2 Encoding Data
#Encode non-target features this way using encode_text_dummy function
#One Hot Encoding
# NOTE(review): attack_cat is one-hot encoded and stays in df as a feature for
# the binary "label" models below; if attack_cat determines label this leaks
# the target - confirm before trusting the reported scores.
for categorical_column in ('proto', 'state', 'service', 'attack_cat', 'is_sm_ips_ports'):
    encode_text_dummy(df, categorical_column)
# Remove Unnecessary Columns
# axis must be passed by keyword: the positional form was removed in pandas 2.0.
df.drop('id', axis=1, inplace=True)
#Step-3 Normalization
#Normalization of Numeric Columns
numeric_columns = (
    'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl',
    'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit',
    'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean',
    'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
    'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
    'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
)
for numeric_column in numeric_columns:
    encode_numeric_zscore(df, numeric_column)
df
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Draw a confusion matrix as a colour-coded image.
# cm is the confusion matrix, names are the class names used as tick labels.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render ``cm`` on the current matplotlib figure.

    The caller is expected to create the figure beforehand and call
    ``plt.show()`` afterwards; this function does neither.
    """
    positions = np.arange(len(names))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(positions, names, rotation=45)
    plt.yticks(positions, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Draw an ROC curve. pred - the predicted scores, y - the expected output.
def plot_roc(pred,y):
    """Plot the ROC curve of ``pred`` against ``y`` in a new figure and show it.

    The area under the curve is printed in the legend; the dashed diagonal is
    drawn for reference.
    """
    fpr, tpr, _ = roc_curve(y, pred)
    curve_label = 'ROC curve (area = %0.2f)' % auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()
# Build the final feature matrix (x) and expected output (y) for the
# fully connected neural network from a pandas DataFrame.
def to_xy(df, target):
    """Split ``df`` into float32 features and the encoded ``target``.

    Every column except ``target`` becomes a feature. An int64/int32 target
    is treated as a class label and one-hot encoded (one column per distinct
    value); any other dtype is returned as a 1-D float32 array.
    """
    feature_columns = [column for column in df.columns if column != target]
    # Resolve the dtype of the target column.
    target_type = df[target].dtypes
    if isinstance(target_type, Sequence):
        target_type = target_type[0]
    features = df[feature_columns].values.astype(np.float32)
    if target_type not in (np.int64, np.int32):
        # Regression
        return features, df[target].values.astype(np.float32)
    # Classification
    one_hot = pd.get_dummies(df[target])
    return features, one_hot.values.astype(np.float32)
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
# Build the feature matrix (x) and expected output (y) for the fully connected
# networks, with "label" as the target column.
x,y = to_xy(df,"label")
# Split into train/test: 32% of the rows are held out, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.32, random_state=42)
# Notebook-style shape checks; these bare expressions have no effect when run as a script.
x_train.shape
y_train.shape
x_test.shape
y_test.shape
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Draw a confusion matrix as a colour-coded image.
# cm is the confusion matrix, names are the class names used as tick labels.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render ``cm`` on the current matplotlib figure.

    The caller is expected to create the figure beforehand and call
    ``plt.show()`` afterwards; this function does neither.
    """
    positions = np.arange(len(names))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(positions, names, rotation=45)
    plt.yticks(positions, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Draw an ROC curve. pred - the predicted scores, y - the expected output.
def plot_roc(pred,y):
    """Plot the ROC curve of ``pred`` against ``y`` in a new figure and show it.

    The area under the curve is printed in the legend; the dashed diagonal is
    drawn for reference.
    """
    fpr, tpr, _ = roc_curve(y, pred)
    curve_label = 'ROC curve (area = %0.2f)' % auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()
#activation function as tanh optimizer as sgd
# Single source of truth for the checkpoint file, so the path that is written
# and the path that is loaded cannot drift apart.
weights_path = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\bst_weights_f_1.hdf5"
# Define ModelCheckpoint outside the loop so one callback object is shared by all restarts
checkpointer = ModelCheckpoint(filepath=weights_path, verbose=2, save_best_only=True) # save best model
for i in range(5):
    print(i)
    # Build network
    model = Sequential()
    model.add(Dense(10, input_dim=x.shape[1], activation='tanh'))
    model.add(Dense(5, activation='tanh'))
    model.add(Dense(y.shape[1], activation='softmax'))
    # One-hot targets with a softmax output: use categorical cross-entropy,
    # matching the other networks in this file.
    model.compile(loss='categorical_crossentropy', optimizer='sgd')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')
    # NOTE(review): the test split doubles as validation data for early
    # stopping and checkpoint selection, so the scores below are optimistic.
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)
print('Training finished...Loading the best model')
print()
# load weights from best model
model.load_weights(weights_path)
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_true = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import metrics
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the tanh/sgd network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
# encode_text_index re-encodes df["label"] in place and returns the class
# values; here the return value is only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
# Precision
pr_score = metrics.precision_score(y_true, pred)
print("Precision score : {}".format(pr_score))
# Recall
re_score = metrics.recall_score(y_true, pred)
print("Recall score : {}".format(re_score))
# F1 score (this assignment rebinds the name f1_score to a number)
f1_score = metrics.f1_score(y_true, pred)
print("F1 score : {}".format(f1_score))
print()
print(classification_report(y_true, pred))
# Plot the ROC curve for the fully connected neural network.
# The raw model outputs are needed again because pred was reduced to class indexes above.
pred = model.predict(x_test)
pred = pred[:,1] # score of output column 1 - presumably the label == 1 class; confirm
plot_roc(pred,y_true)
# Sigmoid hidden layers trained with the sgd optimizer.
# The checkpoint callback is created once, outside the restart loop.
best_weights_file = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Bestweights\bst_weights_f2.hdf5"
checkpointer = ModelCheckpoint(filepath=best_weights_file, verbose=2, save_best_only=True)
for i in range(5):
    print(i)
    # Fresh 20-10-6 network for every restart.
    model = Sequential([
        Dense(20, input_dim=x.shape[1], activation='sigmoid'),
        Dense(10, activation='sigmoid'),
        Dense(6, activation='sigmoid'),
        Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='sgd')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')
    model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        callbacks=[monitor, checkpointer],
        verbose=2, epochs=100,
    )
print('Training finished...Loading the best model')
print()
# Restore the checkpointed weights before scoring.
model.load_weights(best_weights_file)
# Accuracy on the held-out split, comparing class indexes.
pred = np.argmax(model.predict(x_test), axis=1)
y_true = np.argmax(y_test, axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the sigmoid/sgd network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
# Precision, recall and F1 computed from the predicted class indexes.
pr_score = metrics.precision_score(y_true, pred)
print("Precision score : {}".format(pr_score))
re_score = metrics.recall_score(y_true, pred)
print("Recall score : {}".format(re_score))
f1_score = metrics.f1_score(y_true, pred)
print("F1 score : {}".format(f1_score))
print()
print(classification_report(y_true, pred))
# Plot the ROC curve from the raw model outputs (pred currently holds class indexes).
pred = model.predict(x_test)
pred = pred[:,1] # score of output column 1 - presumably the label == 1 class; confirm
plot_roc(pred,y_true)
# ReLU hidden layers trained with the adam optimizer.
# The checkpoint callback is created once, outside the restart loop.
best_weights_file = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Bestweights\bst_weights_f3.hdf5"
checkpointer = ModelCheckpoint(filepath=best_weights_file, verbose=2, save_best_only=True)
for i in range(5):
    print(i)
    # Fresh 20-8-4 network for every restart.
    model = Sequential([
        Dense(20, input_dim=x.shape[1], activation='relu'),
        Dense(8, activation='relu'),
        Dense(4, activation='relu'),
        Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')
    model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        callbacks=[monitor, checkpointer],
        verbose=2, epochs=100,
    )
print('Training finished...Loading the best model')
print()
# Restore the checkpointed weights before scoring.
model.load_weights(best_weights_file)
# Accuracy on the held-out split, comparing class indexes.
pred = np.argmax(model.predict(x_test), axis=1)
y_true = np.argmax(y_test, axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the relu/adam network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
# Precision, recall and F1 computed from the predicted class indexes.
p_score = metrics.precision_score(y_true, pred)
print("Precision score : {}".format(p_score))
r_score = metrics.recall_score(y_true, pred)
print("Recall score : {}".format(r_score))
f1_score = metrics.f1_score(y_true, pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_true, pred))
# Plot the ROC curve from the raw model outputs (pred currently holds class indexes).
pred = model.predict(x_test)
pred = pred[:,1] # score of output column 1 - presumably the label == 1 class; confirm
plot_roc(pred,y_true)
# ReLU hidden layers trained with the sgd optimizer.
# The checkpoint callback is created once, outside the restart loop.
best_weights_file = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Bestweights\bst_weights_f4.hdf5"
checkpointer = ModelCheckpoint(filepath=best_weights_file, verbose=2, save_best_only=True)
for i in range(5):
    print(i)
    # Fresh 20-8-4 network for every restart.
    model = Sequential([
        Dense(20, input_dim=x.shape[1], activation='relu'),
        Dense(8, activation='relu'),
        Dense(4, activation='relu'),
        Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='sgd')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')
    model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        callbacks=[monitor, checkpointer],
        verbose=2, epochs=100,
    )
print('Training finished...Loading the best model')
print()
# Restore the checkpointed weights before scoring.
model.load_weights(best_weights_file)
# Accuracy on the held-out split, comparing class indexes.
pred = np.argmax(model.predict(x_test), axis=1)
y_true = np.argmax(y_test, axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the relu/sgd network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
# Precision, recall and F1 computed from the predicted class indexes.
precission = metrics.precision_score(y_true, pred)
print("Precision score : {}".format(precission))
recall = metrics.recall_score(y_true, pred)
print("Recall score : {}".format(recall))
f1_score = metrics.f1_score(y_true, pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_true, pred))
# Plot the ROC curve from the raw model outputs (pred currently holds class indexes).
pred = model.predict(x_test)
pred = pred[:,1] # score of output column 1 - presumably the label == 1 class; confirm
plot_roc(pred,y_true)
# Tanh hidden layers trained with the adam optimizer.
# The checkpoint callback is created once, outside the restart loop.
best_weights_file = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Bestweights\bst_weights_f7.hdf5"
checkpointer = ModelCheckpoint(filepath=best_weights_file, verbose=2, save_best_only=True)
for i in range(5):
    print(i)
    # Fresh 20-5 network for every restart.
    model = Sequential([
        Dense(20, input_dim=x.shape[1], activation='tanh'),
        Dense(5, activation='tanh'),
        Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')
    model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        callbacks=[monitor, checkpointer],
        verbose=2, epochs=100,
    )
print('Training finished...Loading the best model')
print()
# Restore the checkpointed weights before scoring.
model.load_weights(best_weights_file)
# Accuracy on the held-out split, comparing class indexes.
pred = np.argmax(model.predict(x_test), axis=1)
y_true = np.argmax(y_test, axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the tanh/adam network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
# Precision, recall and F1 computed from the predicted class indexes.
pr = metrics.precision_score(y_true, pred)
print("Precision score : {}".format(pr))
recall = metrics.recall_score(y_true, pred)
print("Recall score : {}".format(recall))
f1_score = metrics.f1_score(y_true, pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_true, pred))
# Plot the ROC curve from the raw model outputs (pred currently holds class indexes).
pred = model.predict(x_test)
pred = pred[:,1] # score of output column 1 - presumably the label == 1 class; confirm
plot_roc(pred,y_true)
# Single 40-unit tanh hidden layer trained with the adam optimizer.
# The checkpoint callback is created once, outside the restart loop.
best_weights_file = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Bestweights\bst_weights_f6.hdf5"
checkpointer = ModelCheckpoint(filepath=best_weights_file, verbose=2, save_best_only=True)
for i in range(5):
    print(i)
    # Fresh network for every restart.
    model = Sequential([
        Dense(40, input_dim=x.shape[1], activation='tanh'),
        Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')
    model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        callbacks=[monitor, checkpointer],
        verbose=2, epochs=100,
    )
print('Training finished...Loading the best model')
print()
# Restore the checkpointed weights before scoring.
model.load_weights(best_weights_file)
# Accuracy on the held-out split, comparing class indexes.
pred = np.argmax(model.predict(x_test), axis=1)
y_true = np.argmax(y_test, axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the 40-unit tanh/adam network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
# Precision, recall and F1 computed from the predicted class indexes.
pr = metrics.precision_score(y_true, pred)
print("Precision score : {}".format(pr))
# NOTE: the name "re" below would shadow the standard-library re module if it were imported.
re = metrics.recall_score(y_true, pred)
print("Recall score : {}".format(re))
f1_score = metrics.f1_score(y_true, pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_true, pred))
# Plot the ROC curve from the raw model outputs (pred currently holds class indexes).
pred = model.predict(x_test)
pred = pred[:,1] # score of output column 1 - presumably the label == 1 class; confirm
plot_roc(pred,y_true)
# Variant of to_xy that keeps the target as a single 1-D column.
def to_XY(df, target):
    """Split ``df`` into float32 features and a 1-D float32 target array.

    Every column except ``target`` becomes a feature. Unlike ``to_xy`` the
    target is never one-hot encoded, whatever its dtype, so the former
    classification/regression branches (which returned the same thing) and
    the unused one-hot computation are gone.
    """
    feature_columns = [column for column in df.columns if column != target]
    features = df[feature_columns].values.astype(np.float32)
    labels = df[target].values.astype(np.float32)
    return features, labels
# Logistic regression baseline on the 1-D label encoding.
x,y = to_XY(df,"label")
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.32,random_state=0)
from sklearn.linear_model import LogisticRegression
# instantiate the model (using the default parameters)
logreg = LogisticRegression()
# fit the model with data
logreg.fit(x_train,y_train)
y_pred=logreg.predict(x_test)
from sklearn import metrics
accuracy=metrics.accuracy_score(y_test,y_pred)
accuracy
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Compute confusion matrix for the logistic regression model
cm = confusion_matrix(y_test, y_pred)
print(cm)
pr= metrics.precision_score(y_test, y_pred)
print("Precision score : {}".format(pr))
re= metrics.recall_score(y_test, y_pred)
print("Recall score : {}".format(re))
f1_score = metrics.f1_score(y_test, y_pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_test, y_pred))
# An ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Column 1 of predict_proba is the
# probability of the second entry of logreg.classes_.
pred = logreg.predict_proba(x_test)[:, 1]
plot_roc(pred,y_test)
#K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train, y_train)
# predict the response for new observations
y_pred=knn.predict(x_test)
from sklearn import metrics
accuracy=metrics.accuracy_score(y_test,y_pred)
print(accuracy)
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Compute confusion matrix for the k-nearest-neighbours model
cm = confusion_matrix(y_test, y_pred)
print(cm)
pr = metrics.precision_score(y_test, y_pred)
print("Precision score : {}".format(pr))
re = metrics.recall_score(y_test, y_pred)
print("Recall score : {}".format(re))
f1_score = metrics.f1_score(y_test, y_pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_test, y_pred))
# An ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Column 1 of predict_proba is the
# probability of the second entry of knn.classes_.
y_score = knn.predict_proba(x_test)[:, 1]
plot_roc(y_score,y_test)
#SVM
x,y = to_XY(df,"label")
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.32,random_state=0)
#Import svm model
from sklearn import svm
#Create a svm Classifier
# Linear kernel; max_iter caps the solver at 500 iterations and
# probability=True makes predict_proba available for the ROC curve below.
clf = svm.SVC(kernel='linear',max_iter=500, degree=3, probability=True)
#Train the model using the training sets
clf.fit(x_train, y_train)
#Predict the response for test dataset
y_pred = clf.predict(x_test)
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Compute confusion matrix for the linear-kernel SVM
cm = confusion_matrix(y_test, y_pred)
print(cm)
pr_score = metrics.precision_score(y_test, y_pred)
print("Precision score : {}".format(pr_score))
re_score = metrics.recall_score(y_test, y_pred)
print("Recall score : {}".format(re_score))
f1_score = metrics.f1_score(y_test, y_pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_test, y_pred))
# An ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Column 1 of predict_proba is the
# probability of the second entry of clf.classes_.
y_score = clf.predict_proba(x_test)[:, 1]
plot_roc(y_score,y_test)
# Polynomial kernel of degree 8; probability=True makes predict_proba
# available for the ROC curve below.
clf1 = svm.SVC(kernel='poly',max_iter=500, degree=8, probability=True)
#Train the model using the training sets
clf1.fit(x_train, y_train)
#Predict the response for test dataset
y_pred = clf1.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Compute confusion matrix for the polynomial-kernel SVM
cm = confusion_matrix(y_test, y_pred)
print(cm)
pr_score = metrics.precision_score(y_test, y_pred)
print("Precision score : {}".format(pr_score))
re_score = metrics.recall_score(y_test, y_pred)
print("Recall score : {}".format(re_score))
f1_score = metrics.f1_score(y_test, y_pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_test, y_pred))
# An ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Column 1 of predict_proba is the
# probability of the second entry of clf1.classes_.
y_score = clf1.predict_proba(x_test)[:, 1]
plot_roc(y_score,y_test)
# RBF kernel; probability=True makes predict_proba available for the ROC
# curve below.
clf2 = svm.SVC(kernel='rbf',max_iter=500, degree=3, probability=True)
#Train the model using the training sets
clf2.fit(x_train, y_train)
#Predict the response for test dataset
y_pred = clf2.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Compute confusion matrix for the RBF-kernel SVM
cm = confusion_matrix(y_test, y_pred)
print(cm)
pr_score = metrics.precision_score(y_test, y_pred)
print("Precision score : {}".format(pr_score))
re_score = metrics.recall_score(y_test, y_pred)
print("Recall score : {}".format(re_score))
f1_score = metrics.f1_score(y_test, y_pred)
print("F1 score : {}".format(f1_score))
print()
# The returned class values are only used as tick labels for the plot.
label = encode_text_index(df,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
print(classification_report(y_test, y_pred))
# An ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Column 1 of predict_proba is the
# probability of the second entry of clf2.classes_.
y_score = clf2.predict_proba(x_test)[:, 1]
plot_roc(y_score,y_test)
**Multi-class classifier**
# Set the desired TensorFlow output level for this example
# tf.logging.set_verbosity(tf.logging.ERROR)
# Reload the combined CSV into a separate frame (df1) for the multi-class
# experiment, so the binary-label frame df is left untouched.
path =r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\Files"
filename = os.path.join(path,"combined_csv.csv")
# 'NA', '?' and '-' are all read as missing values.
df1 = pd.read_csv(filename,na_values=['NA','?','-'])
#df1 = pd.read_csv(filename,na_values=['NA','?'])
df1
# Rows with at least one missing value (notebook display only).
df1[df1.isnull().any(axis=1)]
# Drop every row that has a missing value.
df1=df1.dropna()
df1
# One-hot encode the categorical feature columns of the multi-class frame.
for categorical_column in ('proto', 'state', 'service', 'is_sm_ips_ports'):
    encode_text_dummy(df1, categorical_column)
#encode_text_dummy(df, 'attack_cat')
#encode_text_index(df,"label")
# attack_cat is the target here: label-encode it and keep the class names
# for the confusion-matrix tick labels.
attack_cat = encode_text_index(df1,"attack_cat")
# axis must be passed by keyword: the positional form was removed in pandas 2.0.
df1.drop('id', axis=1, inplace=True)
df1.drop('label', axis=1, inplace=True)
# Z-score normalisation of the numeric columns.
numeric_columns = (
    'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl',
    'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit',
    'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean',
    'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
    'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
    'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
)
for numeric_column in numeric_columns:
    encode_numeric_zscore(df1, numeric_column)
df1
# Multi-class network: predict the label-encoded attack_cat column.
x,y = to_xy(df1,"attack_cat")
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.34, random_state=42)
model = Sequential()
model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
# NOTE(review): no activation is given for this layer - confirm that a
# layer without a non-linearity is intended.
model.add(Dense(10))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=0, mode='auto')
# Single source of truth for the checkpoint file.
weights_path = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\bstw_extra2.hdf5"
checkpointer = ModelCheckpoint(filepath=weights_path, verbose=2,save_best_only=True) # save best model
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)
print('Training finished...Loading the best model')
print()
# load weights from best model (once; the second identical load was redundant)
model.load_weights(weights_path)
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_true = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import metrics
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the multi-class network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
#attack_cat = encode_text_index(df1,"attack_cat")
# attack_cat holds the class names returned when df1 was label-encoded; they
# are used as tick labels.
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, attack_cat)
plt.show()
# With more than two classes the per-class scores are combined with
# average='weighted'.
pr_score = metrics.precision_score(y_true, pred,average='weighted')
print("Precision score : {}".format(pr_score))
re_score = metrics.recall_score(y_true, pred,average='weighted')
print("Recall score : {}".format(re_score))
f1_score = metrics.f1_score(y_true, pred,average='weighted')
print("F1 score : {}".format(f1_score))
print()
print(classification_report(y_true, pred))
# Feature selection by Pearson correlation with the binary label.
import seaborn as sns
plt.figure(figsize=(40,40))
cor = df.corr(method="pearson")
sns.heatmap(cor, annot=True, cmap="RdYlGn")
plt.show()
# Absolute correlation of every column with "label" (the label's own entry is included).
cor_target = abs(cor["label"])
# Keep the columns whose absolute correlation exceeds 0.3.
relevant_features = cor_target[cor_target>0.3]
relevant_features
# Same selection using Kendall rank correlation; cor and cor_target are overwritten.
plt.figure(figsize=(40,40))
cor = df.corr(method ='kendall')
sns.heatmap(cor, annot=True, cmap="RdYlGn")
plt.show()
cor_target = abs(cor["label"])
# Keep the columns whose absolute correlation exceeds 0.3.
rel_features = cor_target[cor_target>0.3]
rel_features
#Chi Square
from sklearn.feature_selection import chi2
# abs() is applied because chi2 rejects negative feature values and the
# numeric columns were z-scored above.
# NOTE(review): taking |z| changes what the statistic measures - confirm
# this is intended.
x_t,y_t= to_xy(abs(df),"label")
chi_scores = chi2(x_t,y_t)
chi_scores
# to_xy keeps df's column order minus the target, so the p-values can be
# indexed by feature name; without an index the bars are labelled only by
# position.
feature_names = [column for column in df.columns if column != "label"]
p_values = pd.Series(chi_scores[1], index=feature_names)
p_values.sort_values(ascending = False , inplace = True)
p_values.plot.bar(figsize=(20,20))
# Build a reduced frame holding only the hand-picked feature columns plus the
# "label" target.
# NOTE(review): the attack_cat-* indicator columns are kept as features for
# predicting label; if attack_cat determines label this leaks the target - confirm.
df4=pd.DataFrame(df, columns=["dpkts","sbytes","dbytes","sttl","sload","dload","dinpkt","dmean"
,"ct_srv_src","ct_state_ttl","ct_src_dport_ltm","ct_dst_sport_ltm","ct_dst_src_ltm"
,"label","state-CON","state-INT","attack_cat-Generic","attack_cat-Normal"])
df4
# Feature matrix and target for the reduced frame.
x_sf,y_sf = to_xy(df4,"label")
# Split into train/test: 32% of the rows are held out, with a fixed seed.
x_train_sf, x_test_sf, y_train_sf, y_test_sf= train_test_split(x_sf, y_sf, test_size=0.32, random_state=42)
x_sf.shape
#activation function as tanh optimizer as sgd
# Single source of truth for the checkpoint file. The checkpoint used to be
# written to bst_weights_et3.hdf5 while bst_weights_t2.hdf5 was loaded, so the
# evaluation either failed or scored weights from a different experiment.
weights_path = r"C:\Users\chama\OneDrive\Desktop\Gowthami\Spring_2020\CSC215\project-1\bst_weights_et3.hdf5"
# Define ModelCheckpoint outside the loop so one callback object is shared by all restarts
checkpointer = ModelCheckpoint(filepath=weights_path, verbose=2, save_best_only=True) # save best model
for i in range(3):
    print(i)
    # Build network
    model = Sequential()
    model.add(Dense(10, input_dim=x_sf.shape[1], activation='tanh'))
    model.add(Dense(5, activation='tanh'))
    model.add(Dense(y_sf.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='sgd')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')
    model.fit(x_train_sf,y_train_sf,validation_data=(x_test_sf,y_test_sf),callbacks=[monitor,checkpointer],verbose=2,epochs=100)
print('Training finished...Loading the best model')
print()
# load weights from best model
model.load_weights(weights_path)
# Measure accuracy
pred = model.predict(x_test_sf)
pred = np.argmax(pred,axis=1)
y_true = np.argmax(y_test_sf,axis=1)
score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the selected-feature network: confusion matrix of true vs. predicted class indexes.
cm = confusion_matrix(y_true, pred)
print(cm)
# encode_text_index re-encodes df4["label"] in place; the returned class
# values are only used as tick labels for the plot.
label = encode_text_index(df4,"label")
print('Plotting confusion matrix')
plt.figure()
plot_confusion_matrix(cm, label)
plt.show()
# Precision, recall and F1 computed from the predicted class indexes.
pr_score = metrics.precision_score(y_true, pred)
print("Precision score : {}".format(pr_score))
re_score = metrics.recall_score(y_true, pred)
print("Recall score : {}".format(re_score))
f1_score = metrics.f1_score(y_true, pred)
print("F1 score : {}".format(f1_score))
print()
print(classification_report(y_true, pred))